In [1]:
#OS level tools
import os
import time
import itertools
from collections import defaultdict
from glob import glob
import psutil
from functools import partial
import re
from __future__ import print_function

#array and data structure
import numpy as np
import pandas as pd
#import seaborn as sb

#Ipython display and widgets
#import ipywidgets as widgets
from IPython.display import Image, HTML, display
from IPython.display import Markdown as md
#from ipywidgets import interact_manual

#holoviews and plotting
import holoviews as hv
import datashader as ds
from holoviews.operation.datashader import aggregate, shade, datashade
from bokeh.models import HoverTool
#from holoviews.operation import decimate

#dask parallelization
import dask.dataframe as dd
from dask import compute, delayed
import dask.threaded
import dask.multiprocessing

#tsne
from MulticoreTSNE import MulticoreTSNE as TSNE
tsne = TSNE(n_jobs=24)

#color assignment
cmap_all=['white','white']
cmap_parent=['black','grey']
cmap_pop=(['darkgreen','lightgreen'], ['darkorange','yellow'], ['purple','blueviolet'], ['darkblue','lightblue'], ['indianred','red'])
for i in range(5):
    cmap_pop=cmap_pop+cmap_pop
background = '#D3D3D3'

#export path assignment
#scratch_path='/scratch/'+os.environ['USER']+'/'+os.environ['SLURM_JOBID']
scratch_path="."
export_path="PNG"
png_path="PNG"
try:
    os.makedirs(export_path)
except OSError as e:
    if e.errno != os.errno.EEXIST:
        raise

#export = partial(export_image, export_path=export_path, background=background)

hv.notebook_extension('bokeh')
#display(HTML("<style>.container { width:100% !important; overflow-x: auto;white-space: nowrap;}</style>"))
hv.opts("RGB [toolbar=None, width=400, height=400, bgcolor='#D3D3D3', fontsize={'title':8, 'xlabel':8, 'ylabel':8, 'ticks':3}]")
In [2]:
import ipywidgets as widgets
box_layout = widgets.Layout(overflow_x='scroll',
                    border='3px solid black',
                    height='',
                    flex_flow='column',
                    display='flex')
row_layout = widgets.Layout(min_width='32000px')
In [3]:
hv.opts("RGB [toolbar=None, width=400, height=400, bgcolor='#D3D3D3', fontsize={'title':15, 'xlabel':10, 'ylabel':10, 'ticks':5}]")
In [4]:
def config_objects(s):
    try:
        with open(s) as config_file:
            config_file.seek(0)
            gates={}
            for line in config_file:
                phenoType=""
                line = line.strip()
                gate = line.split("\t")
                if len(gate)==12:
                    phenoType=gate[11]
                gates.update({"pop"+str(gate[0]):[int(gate[0]), int(gate[1]), int(gate[2]), int(gate[3]), int(gate[4]), int(gate[5]), int(gate[6]), int(gate[7]), int(gate[8]), int(gate[9]), int(gate[10]), phenoType]})
            return gates
    except:
        raise Exception("Error parsing configuration file")

def config_summary(s, h):
    try:
        with open(s) as config_file:
            config_file.seek(0)
            gates={}
            for line in config_file:
                phenoType=""
                line = line.strip()
                gate = line.split("\t")
                xmarker=str(h[int(gate[1])-1])
                ymarker=str(h[int(gate[2])-1])
                startx=int((float(gate[3])/200)*4096)
                starty=int((float(gate[5])/200)*4096)
                endx=int((float(gate[4])/200)*4096)
                endy=int((float(gate[6])/200)*4096)
                parent="pop"+gate[7]
                if len(gate)==12:
                    phenoType=gate[11]
                gates.update({"pop"+str(gate[0]):[int(gate[0]), parent, xmarker, ymarker, phenoType, startx, endx, starty, endy]})
            return gates
    except:
        raise Exception("Error parsing configuration file")

_nsre = re.compile('([0-9]+)')
def natural_sort_key(s):
    return [int(text) if text.isdigit() else text.lower()
            for text in re.split(_nsre, s)]

def natural_sort(l):
    #https://stackoverflow.com/a/4836734/846892
    convert = lambda text: int(text) if text.isdigit() else text.lower()
    alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
    return sorted(l, key = alphanum_key)

def label_color (pops, row):
    eventcolor=0
    for i, pop in enumerate(pops):
        if row[pop]==0:
            eventcolor=i+1
    return eventcolor

def label_color2 (pops, row):
    eventcolor="base"
    for i, pop in enumerate(pops):
        if row[pop]==0:
            eventcolor=pop
    return eventcolor

def parseCohort(s):
    cohort_file=open(s)

    return

def parseDataFrame(s):
    result_file=open(s)
    sampleLabel=os.path.splitext(s)[0]
    events = sum(1 for line in result_file) -1 #quickly determine number of events
    result_file.seek(0) #rewind file to beginning
    header = result_file.readline()
    header = header.strip()
    headers = header.split("\t")

    pop_offset=len(headers)
    popList=[]
    for i,header in enumerate(headers):
        if header == "pop1":
            pop_offset=i
        if "pop" in header:
            popList.append(header)
    markers = headers[0:pop_offset]
    result_file.seek(0) #rewind file to beginning

    df = pd.read_csv(s, sep='\t')
    dataIndex={}
    for i,header in enumerate(headers):
        dataIndex.update({header:i})
    df['pop0']=0
    return [sampleLabel,headers,markers,popList,df]

def parseDAFi(s):
    df = pd.read_csv(s, sep='\t')
    df['pop0']=0
    return df

def html_row(file):
     return '<img src="{}" style="display:inline;margin:1px" title="{}"/>'.format(export_path+"/"+file+".png",file,file)
In [5]:
display(md("# Automated Analysis Report with Static Composite Dot Plots"))

Automated Analysis Report with Static Composite Dot Plots

In [6]:
titlefilename=glob('description.txt')
if titlefilename:
    titlefile=open(titlefilename[0])
    title=titlefile.readline()
    titlefile.close()
    display(md("# Dataset: %s"%(title)))
else:
    display(md("# Dataset: no description given"))

Dataset: this time using Asian dataset

In [7]:
%%output backend='bokeh'
%%opts Table [width=1200]
metadatafilename=glob('metadata.txt')
if metadatafilename:
    metadatafile=open(metadatafilename[0])
    metaheader = metadatafile.readline()
    metaheader = metaheader.strip()
    metaheaders = metaheader.split("\t")
    metaDf=pd.read_csv('metadata.txt', sep='\t')
    metaTable=hv.Table(metaDf)
    display(md("## Metadata"))
    display(metaTable)
else:
    display(md("## No metadata info given"))

Metadata

In [8]:
gatedFiles=sorted(glob('Gated/*/flock*.txt'))
gatedDelayed=[[(os.path.split(os.path.dirname(fn))[1]),delayed(parseDAFi)(fn)] for fn in gatedFiles]
sample_labels=[os.path.split(os.path.dirname(fn))[1] for fn in gatedFiles]
dfArray=compute(*gatedDelayed, get=dask.threaded.get)
In [9]:
headers=list(dfArray[0][1])
pop_offset=len(headers)
popList=[]
for i,header in enumerate(headers):
    if header == "pop1":
        pop_offset=i
    if "pop" in header:
        popList.append(header)
markers = headers[0:pop_offset]

DAFi Configuration

In [10]:
%%output backend='bokeh'
%%opts Table.gates [width=1200]
%%opts Table.summary [width=1200]
configLabel="pipeline.config"
gates=config_objects(configLabel)
num_gates = len(gates)
summary=config_summary(configLabel, headers)
num_gates = len(summary)

gatesummary = [v for v in summary.values()]
di = {summary.get(element)[0]:str(summary.get(element)[0]).zfill(2)+"_"+summary.get(element)[4] for i,element in enumerate(summary)}
summaryTable=hv.Table(gatesummary,kdims=['Population','Parent','XMarker','YMarker','phenotype','startx', 'endx', 'starty', 'endy'], group='summary', label='Summary')

sortedTable=summaryTable.sort('Population')
sortedTable
Out[10]:

Marker and Axis Tables

In [11]:
%%output backend='bokeh'
axis_popIndexDict = defaultdict(list)
popBounds={}
axises=[]
composite_axis=0
last_xmarker=""
last_ymarker=""
last_parent=0
gatesconfig=[]
for i in range(len(gates)):
    pop="pop"+str(i+1)
    config=gates.get(pop)
    xmarker=str(headers[config[1]-1])
    ymarker=str(headers[config[2]-1])
    startx=int((float(config[3])/200)*4096)
    starty=int((float(config[5])/200)*4096)
    endx=int((float(config[4])/200)*4096)
    endy=int((float(config[6])/200)*4096)
    parent=int(config[7])
    ctype=int(config[8])
    popBounds.update({pop:[xmarker, ymarker, startx,starty,endx,endy,ctype,"pop"+str(parent)]})
    key="axis"+str(composite_axis).zfill(2)
    if (xmarker != last_xmarker) or (ymarker != last_ymarker) or (parent != last_parent):
        composite_axis=composite_axis+1
        key="axis"+str(composite_axis).zfill(2)
        axises.append([xmarker,ymarker,key,"pop"+str(parent)])
    axis_popIndexDict[key].append(pop)
    last_xmarker=xmarker
    last_ymarker=ymarker
    last_parent=parent
    gatesconfig.append([pop,xmarker,ymarker,parent])

num_axises = len(axises)
markerTable=hv.Table(markers,kdims=['Marker'])
axis_popTable=hv.Table(axis_popIndexDict, kdims=['Axis Index'], vdims=['sub populations'])
markerTable+axis_popTable.sort('Axis Index')
Out[11]:
In [12]:
hv.notebook_extension('matplotlib')

Population Percentage and Events Tables

In [13]:
batchpercent_df = pd.read_csv('Gated/Batch_percentages.txt', sep='\t', index_col=0)
batchpercent_df=batchpercent_df.rename(di).round(2)
with pd.option_context('display.max_columns', None):
    display(batchpercent_df)
1015_Combo 1016_Combo 1017_Combo 1018_Combo 1023_Combo 1024_Combo 1025_Combo 1029_Combo 1031_Combo 1032_Combo 1033_Combo 1036_Combo
01_ 34.23 30.89 37.89 23.58 10.56 14.64 7.74 33.45 26.43 14.44 25.68 23.70
02_ 99.98 99.92 99.94 99.87 99.45 99.17 96.97 96.44 99.84 98.43 94.49 99.55
03_ 80.73 76.92 76.32 76.35 64.50 81.09 58.37 85.05 71.44 62.44 77.80 69.72
04_ 5.25 12.71 12.26 7.84 27.27 7.46 16.50 6.85 10.56 11.53 9.61 16.81
05_ 12.01 16.23 8.52 10.77 23.38 9.06 11.92 7.76 7.47 12.60 12.10 13.00
06_ 62.96 47.93 67.88 77.16 17.73 67.06 44.83 73.98 82.72 83.24 53.86 60.18
07_ 62.68 46.05 63.94 75.33 17.56 64.29 43.00 72.35 80.30 79.04 46.54 58.13
08_ 36.95 53.88 35.31 24.67 82.40 35.71 56.76 27.51 18.57 20.95 53.18 40.31
09_ 91.20 95.72 90.82 91.21 94.54 83.20 94.21 86.72 84.16 91.80 92.32 95.07
10_ 1.12 0.49 0.60 1.38 1.94 5.15 1.32 3.61 2.19 1.03 1.92 0.58
11_ 7.27 3.58 7.91 5.90 3.48 11.25 4.27 8.60 12.93 7.02 4.83 4.26
12_ 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00
13_ 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00
14_ 4.17 1.24 5.25 6.60 0.96 3.49 12.70 0.84 7.11 13.56 4.13 3.59
In [14]:
batchevents_df = pd.read_csv('Gated/Batch_events.txt', sep='\t', index_col=0)
batchevents_df=batchevents_df.rename(di)
with pd.option_context('display.max_columns', None):
    display(batchevents_df)
1015_Combo 1016_Combo 1017_Combo 1018_Combo 1023_Combo 1024_Combo 1025_Combo 1029_Combo 1031_Combo 1032_Combo 1033_Combo 1036_Combo
01_ 104172 90333 206031 59975 118949 84831 54270 95441 121789 71330 76171 79262
02_ 104147 90262 205900 59899 118289 84123 52624 92047 121598 70212 71975 78908
03_ 84076 69433 157144 45731 76299 68213 30715 78284 86872 43840 55998 55017
04_ 5464 11473 25238 4699 32260 6273 8682 6304 12845 8092 6914 13268
05_ 12512 14650 17546 6454 27654 7619 6272 7147 9078 8844 8712 10256
06_ 7877 7022 11911 4980 4902 5109 2812 5287 7509 7362 4692 6172
07_ 7842 6747 11219 4862 4855 4898 2697 5171 7290 6990 4055 5962
08_ 4623 7894 6196 1592 22787 2721 3560 1966 1686 1853 4633 4134
09_ 4216 7556 5627 1452 21543 2264 3354 1705 1419 1701 4277 3930
10_ 52 39 37 22 442 140 47 71 37 19 89 24
11_ 336 283 490 94 793 306 152 169 218 130 224 176
12_ 52 39 37 22 442 140 47 71 37 19 89 24
13_ 336 283 490 94 793 306 152 169 218 130 224 176
14_ 4345 1117 10814 3953 1135 2939 6681 773 8640 9520 2975 2834

Combined Percent/Events Dataframe

In [15]:
%%output backend="bokeh"
%%opts Table [width=1000]

p_df=pd.DataFrame(batchpercent_df.unstack())
p_df.columns=['Percent']

e_df=pd.DataFrame(batchevents_df.unstack())
e_df.columns=['Events']

c_df=pd.concat([p_df,e_df],axis=1, join='outer').reset_index()
c_df.columns=['Sample','Population','Percent','Events']
c_df=c_df.replace({"Population":di})
In [16]:
%%output backend="bokeh" size=150
%%opts BoxWhisker [xrotation=45]
percentBoxPlot=hv.BoxWhisker(c_df, kdims=['Population'],vdims='Percent').relabel('Population Percent Box Plot')
eventsBoxPlot=hv.BoxWhisker(c_df, kdims=['Population'],vdims='Events').relabel('Population Events Box Plot')
percentBoxPlot+eventsBoxPlot
Out[16]:
In [17]:
# centroidFiles=sorted(glob('Gated/*/cent*.txt'))
# reclusterset=set(["pop"+os.path.basename(fn).split(".")[0].split("centroids")[1]for fn in centroidFiles])
# reclustermap={}
# currentcluster="pop0"
# for j, gate in enumerate(gatesconfig):
#     parent="pop"+str(gate[3])
#     if parent in reclusterset:
#         currentcluster=parent
#     reclustermap.update({gate[0]: currentcluster})
# centDict=dict(((os.path.split(os.path.dirname(fn))[1]+"_pop"+os.path.basename(fn).split(".")[0].split("centroids")[1], parseDAFi(fn)) for fn in centroidFiles))
# #centDict=compute(*centDelayed, get=dask.threaded.get)
# centArray=[[sample, gate[0], centDict.has_key(sample+"_"+reclustermap.get(gate[0]))] for k, sample in enumerate(sample_labels) for j, gate in enumerate(gatesconfig)]

Composite 2D Dot-Plots Gated Populations

In [18]:
poplist=natural_sort(popBounds.keys())
hv.opts("RGB [width=600, height=600, bgcolor='#D3D3D3', fontsize={'title':15, 'xlabel':10, 'ylabel':10, 'ticks':10}]")
In [19]:
hv.opts("Points.cent (color='purple' marker='+' size=10)")
In [20]:
size=400
popdfPlots = hv.HoloMap({(sample, j+1): datashade(hv.Points(dfArray[k][1].loc[dfArray[k][1][gate[0]]==0], kdims=[gate[1], gate[2]]), width=size, height=size, x_range=(0,4096), y_range=(0,4096), dynamic=False, link_inputs=False, cmap=cmap_pop[j])
                    for k, sample in enumerate(sample_labels) for j, gate in enumerate(gatesconfig)}, kdims=['Sample', 'Pop'])
alldfPlots = hv.HoloMap({(sample, j+1): datashade(hv.Points(dfArray[k][1], kdims=[gate[1], gate[2]]), width=size, height=size, x_range=(0,4096), y_range=(0,4096), dynamic=False, link_inputs=False, cmap=cmap_all)
                    for k, sample in enumerate(sample_labels) for j, gate in enumerate(gatesconfig)}, kdims=['Sample', 'Pop'])
parentdfPlots = hv.HoloMap({(sample, j+1): datashade(hv.Points(dfArray[k][1].loc[(dfArray[k][1]["pop"+str(gate[3])]==0) & (dfArray[k][1][gate[0]]==1)], kdims=[gate[1], gate[2]]), width=size, height=size, x_range=(0,4096), y_range=(0,4096), dynamic=False, link_inputs=False, cmap=cmap_parent)
                    for k, sample in enumerate(sample_labels) for j, gate in enumerate(gatesconfig)}, kdims=['Sample', 'Pop'])
boundarydfPlots = hv.HoloMap({(sample, j+1): hv.Bounds((popBounds.get(gate[0])[2], popBounds.get(gate[0])[3], popBounds.get(gate[0])[4], popBounds.get(gate[0])[5])).opts(style=dict(line_color=cmap_pop[j][0],color=cmap_pop[j][0]))
                    for k, sample in enumerate(sample_labels) for j, gate in enumerate(gatesconfig)}, kdims=['Sample', 'Pop'])
# centroiddfPlots = hv.HoloMap({(sample, j+1): hv.Points(centDict.get(sample+"_"+reclustermap.get(gate[0])), kdims=[gate[1], gate[2]], group="cent")
#                     for k, sample in enumerate(sample_labels) for j, gate in enumerate(gatesconfig)}, kdims=['Sample', 'Pop'])
In [21]:
def outputSampleGates(sample):

    filename=export_path+"/"+sample
    hv.output(hv.NdLayout(combineddfPlots[sample,:]).cols(1), backend='matplotlib', size=200, fig='png', filename=filename)
    #print(filename)
    return filename

def outputPopGates(pop):

    filename=export_path+"/"+pop
    length=len(combineddfPlots[:,poplist.index(pop)])
    hv.output(hv.NdLayout(combineddfPlots[:,poplist.index(pop)]).cols(length), backend='matplotlib', size=200, fig='html', filename=filename)

    return filename

def displaySampleGates(sample):

    filename=export_path+"/"+sample
    temp=hv.NdLayout(combineddfPlots[sample,:]).cols(1)

    return temp

def displayPopGates(pop):

    filename=export_path+"/"+pop
    length=len(combineddfPlots[:,poplist.index(pop)])
    temp=hv.NdLayout(combineddfPlots[:,poplist.index(pop)]).cols(length)

    return temp
In [22]:
#hv.opts("RGB [width=600, height=600, bgcolor='#D3D3D3', fontsize={'title':15, 'xlabel':10, 'ylabel':10, 'ticks':10}]")
#temp=hv.output(hv.NdLayout(combineddfPlots).cols(5), backend='matplotlib', size=200, fig='png', filename=export_path+"/sample_composite")
In [23]:
#display(HTML('<iframe src=PNG/pop2.html width=3000 height=350> </iframe>'),HTML('<iframe src=PNG/pop3.html width=3000 height=350></iframe>'))
In [24]:
#%%time 
#testOutput2=[[pop, outputPopGates(pop)] for pop in poplist]
In [25]:
combineddfPlots=alldfPlots*parentdfPlots*popdfPlots*boundarydfPlots
#combineddfPlots=alldfPlots*parentdfPlots*popdfPlots*boundarydfPlots*centroiddfPlots
testOutput=[[sample, outputSampleGates(sample)] for sample in sample_labels]
In [26]:
#html=''.join(html_row(sample) for sample in sample_labels)
#widgets.HTML(value=html, layout=row_layout)
In [27]:
#testOutput2=[[pop, outputPopGates(pop)] for pop in poplist]
#images=[Image(filename = export_path+"/"+pop+".png") for pop in poplist]
#display(*images)
In [28]:
html="".join(html_row(sample) for sample in sample_labels)
html="<div style=\"width:100% !important; overflow-x: auto;white-space: nowrap;\">"+html+"</div>"
display(HTML(html))

tSNE mapping of population percentage

In [29]:
percentdf=batchpercent_df.transpose()
tsne_data_array=percentdf.values.astype(np.float64)
In [30]:
data_tsne = tsne.fit_transform(np.copy(tsne_data_array))
dfn=pd.DataFrame(data_tsne, columns=['tsne-x','tsne-y'], index=percentdf.index)
results=pd.concat([percentdf,dfn],axis=1)
colnames=list(results)[0:-2]
In [31]:
%%output backend='bokeh'
%%opts Scatter.tSNE (size=5 nonselection_color='grey' cmap='Reds') [bgcolor='#D3D3D3' color_index=2 width=400 height=400 tools=['hover','box_select','poly_select']]
%%opts Layout [shared_datasource=True]
labels=[kd for i, kd in enumerate(colnames)]
holomap = hv.HoloMap({(kd): hv.Scatter(results, kdims=['tsne-x','tsne-y'],vdims=[kd], group="tSNE") for i, kd in enumerate(colnames)}, kdims='Population')
hv.Layout(holomap.layout())
Out[31]:
In [32]:
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Out[32]: